Load required libraries.

library(ggplot2)
library(grid)
library(magrittr)
library(dplyr)
library(tidyr)
library(quadprog)
source('data_loading.R')

Load in Wordbank common data.

# Open a connection to the production Wordbank database and pull the shared
# lookup tables (helpers defined in data_loading.R).
wordbank <- connect.to.wordbank("prod")

common.tables <- get.common.tables(wordbank)

# Item-level metadata across instruments, restricted to word items
# (dropping gestures, phrases, etc.).
items <- get.item.data(common.tables$wordmapping,
                       common.tables$instrumentsmap,
                       common.tables$category) %>%
  filter(type == "word")

# One entry per (language, form) instrument; downstream code reads a
# `table` list-column from this — presumably the per-instrument response
# table; verify against data_loading.R.
instrument.tables <- get.instrument.tables(wordbank, common.tables$instruments)

Get vocabulary composition data for all languages.

# Compute per-child vocabulary composition for one instrument.
#
# For the given language and form, returns a long data frame with one row per
# (data_id, lexical_category) holding, for production and comprehension:
#   *.prop  — proportion of that category's items the child knows
#   *.vocab — the child's overall vocabulary size as a proportion of all items
# plus `language` and `form` identifier columns.
get.vocab.comp <- function(input_language, input_form) {
  
  # Word items for this instrument, limited to the three content categories.
  # Item ids arrive as strings (e.g. "item_123"); strip a fixed 5-character
  # prefix to recover the numeric id — assumes that prefix length is
  # constant across instruments (TODO confirm).
  lang.vocab.items <- filter(items, language == input_language, form == input_form) %>%
    filter(lexical_category %in% c("nouns", "predicates", "function_words")) %>%
    rename(column = item.id) %>%
    mutate(item.id = as.numeric(substr(column, 6, nchar(column))))
  
  # The raw response table for this (language, form) instrument.
  lang.instrument.table <- filter(instrument.tables,
                                  language == input_language,
                                  form == input_form)$table[[1]]
  
  # Child x item responses, joined to item metadata. left_join with no `by`
  # relies on matching column names (presumably item.id); NA responses are
  # treated as "does not know". "produces" implies "understands".
  lang.vocab.data <- get.instrument.data(lang.instrument.table,
                                         lang.vocab.items$column) %>%
    left_join(select(lang.vocab.items, item.id, lexical_category, item, definition)) %>%
    mutate(value = ifelse(is.na(value), "", value),
           produces = value == "produces",
           understands = value == "produces" | value == "understands")
  
  # Total number of content-word items on this instrument.
  num.words <- nrow(lang.vocab.items)
  
  # Per-child, per-category counts and proportions.
  lang.vocab.summary <- lang.vocab.data %>%
    group_by(data_id, lexical_category) %>%
    summarise(production.num = sum(produces),
              production.prop = sum(produces) / length(produces),
              comprehension.num = sum(understands),
              comprehension.prop = sum(understands) / length(understands))
  
  # summarise() above left the data grouped by data_id, so this second
  # summarise collapses over categories: each child's total words known
  # divided by the instrument's word count = overall vocabulary size.
  lang.vocab.sizes <- lang.vocab.summary %>%
    summarise(production.vocab = sum(production.num) / num.words,
              comprehension.vocab = sum(comprehension.num) / num.words)
  
  # Attach overall vocabulary size to each per-category row (joined on
  # data_id by shared name), drop raw counts, and tag the instrument.
  lang.vocab.summary %>%
    left_join(lang.vocab.sizes) %>%
    select(-production.num, -comprehension.num) %>%
    mutate(language = input_language,
           form = input_form)
  
  }
# Compute vocabulary composition for every language that has the given form,
# then reshape to long-by-measure format.
#
# Returns one row per (data_id, lexical_category, measure) with columns
# `prop` (proportion of the category known) and `vocab` (overall vocabulary
# size), obtained by gathering the four measure.var columns and splitting
# their names into measure/var.
form.vocab.comp <- function(input_form) {
  form.languages <- unique(filter(instrument.tables, form == input_form)$language)
  # lapply is the idiomatic equivalent of sapply(..., simplify = FALSE).
  bind_rows(lapply(form.languages,
                   function(lang) get.vocab.comp(lang, input_form))) %>%
    mutate(lexical_category = factor(lexical_category,
                                     levels = c("nouns", "predicates", "function_words"),
                                     labels = c("Nouns", "Predicates", "Function Words"))) %>%
    gather(measure.var, value,
           production.prop, production.vocab,
           comprehension.prop, comprehension.vocab) %>%
    # Split e.g. "production.prop" into measure = "production", var = "prop".
    extract(measure.var, c("measure", "var"), "([[:alnum:]]+)\\.([[:alnum:]]+)") %>%
    spread(var, value)
}

# WG contributes both production and comprehension; for WS keep only the
# production rows before combining the two forms into one data frame.
wg.vocab.comp <- form.vocab.comp("WG")
ws.vocab.comp <- filter(form.vocab.comp("WS"), measure == "production")

vocab.comp <- bind_rows(ws.vocab.comp, wg.vocab.comp)

Show sample size of each instrument.

# Per-instrument sample sizes. vocab.comp has one row per child within each
# (language, form, measure, lexical_category) cell, so n() counts children;
# the counts are expected to repeat across measure and category for a given
# instrument, hence the distinct() after dropping those columns.
sample_sizes <- vocab.comp %>%
  group_by(language, form, measure, lexical_category) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  select(language, form, n) %>%
  distinct()

# kable() lives in knitr, which is never attached above — namespace the
# call so the script also runs outside a knitr document.
knitr::kable(sample_sizes)
language form n
BSL WG 162
Cantonese WS 987
Croatian WG 250
Croatian WS 377
Danish WS 3714
English WG 2416
English WS 5824
German WS 1183
Hebrew WG 63
Hebrew WS 253
Italian WS 752
Mandarin WS 1056
Norwegian WG 3025
Norwegian WS 12969
Russian WG 768
Russian WS 1037
Spanish WG 778
Spanish WS 1094
Swedish WG 474
Swedish WS 900
Turkish WG 1115
Turkish WS 2422

Base plot for looking at vocabulary composition.

# Scatter plot of category proportion vs. overall vocabulary size, faceted
# by language, for one (form, measure) slice of vocab.comp. The dashed
# diagonal marks proportional growth; `font` is a global set elsewhere.
base_plot <- function(input_form, input_measure) {
  plot.data <- filter(vocab.comp, form == input_form, measure == input_measure)
  axis.breaks <- seq(0, 1, 0.2)
  ggplot(plot.data,
         aes(x = vocab, y = prop, colour = lexical_category)) +
    facet_wrap(~ language) +
    geom_jitter(size = 0.7) +
    scale_y_continuous(limits = c(0, 1), breaks = axis.breaks,
                       name = "Proportion of Category\n") +
    scale_x_continuous(limits = c(0, 1), breaks = axis.breaks,
                       name = "\nVocabulary Size") +
    geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") +
    theme_bw(base_size = 12) +
    theme(legend.position = c(0.068, 0.95),
          legend.text = element_text(size = 9),
          legend.title = element_text(size = 9, lineheight = unit(0.8, "char")),
          legend.key.height = unit(0.8, "char"),
          legend.key.width = unit(0.3, "cm"),
          legend.key = element_blank(),
          legend.background = element_rect(fill = "transparent"),
          text = element_text(family = font)) +
    scale_color_brewer(palette = "Set1", name = "Lexical Category")
}

Plot WS productive vocabulary composition as a function of vocabulary size for each language.

# NOTE(review): base_plot() already adds a geom_jitter layer, so this draws
# every point a second time with independent jitter — confirm the double
# layer is intentional.
base_plot("WS", "production") + geom_jitter(size = 0.7)

Plot WG productive vocabulary composition as a function of vocabulary size for each language.

# NOTE(review): base_plot() already adds a geom_jitter layer, so this draws
# every point a second time with independent jitter — confirm the double
# layer is intentional.
base_plot("WG", "production") + geom_jitter(size = 0.7)

Plot WG receptive vocabulary composition as a function of vocabulary size for each language.

# NOTE(review): base_plot() already adds a geom_jitter layer, so this draws
# every point a second time with independent jitter — confirm the double
# layer is intentional.
base_plot("WG", "comprehension") + geom_jitter(size = 0.7)

Make an S3 class for a clm (constrained linear model), which uses quadratic programming to run a regression on data with a specified formula, subject to the constraint that the coefficients of the regression sum to 1 (in the future could support arbitrary constraints on the coefficients).

# Constrained linear model: least-squares fit of `formula` on `data` subject
# to the coefficients summing to 1, solved as a quadratic program.
#
# Returns the solve.QP result with class "clm" and the formula attached;
# $solution holds the fitted coefficients.
clm <- function(formula, data, ...) {
  M <- model.frame(formula, data)
  y <- M[, 1]              # response
  X <- as.matrix(M[, -1])  # design matrix (our formulas drop the intercept)
  # minimize ||y - X b||^2  <=>  min (1/2) b'(X'X)b - (y'X)b,
  # with the single equality constraint sum(b) = 1 (meq = 1).
  # (Arguments spelled out: the original relied on partial matching of
  # matrix()'s nr=/nc=, which is fragile.)
  s <- solve.QP(Dmat = crossprod(X),
                dvec = t(y) %*% X,
                Amat = matrix(1, nrow = ncol(X), ncol = 1),
                bvec = 1,
                meq = 1)
  class(s) <- "clm"
  s$formula <- formula
  s
}

# S3 predict method for clm
# S3 predict method for clm: evaluate the fitted model at newdata.
#
# `object$formula[-2]` drops the response term, so model.frame builds just
# the predictor columns, in the same order as the fitted coefficients.
# `...` is accepted (and ignored) to match the predict() generic signature.
# Returns a plain numeric vector of predictions.
predict.clm <- function(object, newdata, ...) {
  M <- as.matrix(model.frame(object$formula[-2], newdata))
  p <- M %*% object$solution
  rownames(p) <- NULL
  p[, 1]
}

# S3 predictdf method for clm (called by stat_smooth)
# predictdf method called by stat_smooth: evaluate the clm fit along xseq.
# Confidence bands are not supported, so `se` and `level` are ignored and
# no ymin/ymax columns are returned.
predictdf.clm <- function(model, xseq, se, level) {
  fitted <- predict(model, newdata = data.frame(x = xseq))
  data.frame(x = xseq, y = as.vector(fitted))
}

Plot WS productive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.

# Overlay the constrained cubic fit; geom_smooth dispatches to clm() and
# predictdf.clm() via method = "clm".
base_plot("WS", "production") +
  geom_smooth(method = "clm", formula = y ~ I(x^3) + I(x^2) + x - 1)

Plot WG productive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.

# Overlay the constrained cubic fit; geom_smooth dispatches to clm() and
# predictdf.clm() via method = "clm".
base_plot("WG", "production") +
  geom_smooth(method = "clm", formula = y ~ I(x^3) + I(x^2) + x - 1)

Plot WG receptive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.

# Overlay the constrained cubic fit; geom_smooth dispatches to clm() and
# predictdf.clm() via method = "clm".
base_plot("WG", "comprehension") +
  geom_smooth(method = "clm", formula = y ~ I(x^3) + I(x^2) + x - 1)

Function for resampling data.

# Bootstrap the "area" statistic: resample children with replacement
# num.times times and, for each (language, form, measure, lexical_category)
# cell, fit the constrained cubic and integrate it relative to the diagonal.
#
# Returns a long data frame with columns language, form, measure,
# lexical_category, sample (replicate label), and area.
sample.areas <- function(d, num.times = 1000) {

  # Signed area between the fitted cubic and the diagonal on [0, 1]:
  # the integral of b3 x^3 + b2 x^2 + b1 x is b3/4 + b2/3 + b1/2, and the
  # diagonal y = x contributes 1/2.
  poly.area <- function(group.data) {
    model <- clm(prop ~ I(vocab^3) + I(vocab^2) + vocab - 1,
                 data = group.data)
    (model$solution %*% c(1 / 4, 1 / 3, 1 / 2) - 0.5)[1]
  }

  # Each replicate's area column is named by this counter so replicates can
  # be joined side by side below.
  counter <- 1
  sample.area <- function(d) {
    d.frame <- d %>%
      group_by(language, form, measure) %>%
      sample_frac(replace = TRUE) %>% # resample kids
      group_by(language, form, measure, lexical_category) %>%
      do(area = poly.area(.)) %>% # do() wraps each result in a list-column
      mutate(area = area[1]) %>%  # unwrap to a plain numeric column
      # NOTE(review): rename_() is deprecated in current dplyr; the
      # tidy-eval equivalent is rename(!!as.character(counter) := area).
      rename_(.dots = setNames("area", counter))

    counter <<- counter + 1 # increment counter outside scope
    return(d.frame)
  }

  areas <- replicate(num.times, sample.area(d), simplify = FALSE)

  # Join the replicates on the shared grouping columns, then melt the
  # numbered area columns into (sample, area) pairs.
  Reduce(left_join, areas) %>%
    gather(sample, area, -language, -form, -measure, -lexical_category)
}

Resample data, compute area for each sample, find the mean and CI of the area estimate.

# Bootstrap the area statistic (100 replicates here for speed) and summarise
# each cell with its mean and 95% percentile interval.
areas <- sample.areas(vocab.comp, 100)

area.stats <- areas %>%
  group_by(language, form, measure, lexical_category) %>%
  summarise(mean = mean(area),
            ci.high = quantile(area, 0.975),
            ci.low = quantile(area, 0.025))

area.summary <- area.stats %>%
  ungroup() %>%
  mutate(language = factor(language))

#nouns <- filter(area.summary, lexical_category == "Nouns")
#noun.levels <- nouns$language[order(nouns$mean, nouns$language, decreasing = FALSE)]
#area.summary %<>% ungroup() %>% mutate(language = factor(language, levels = noun.levels))

Plot each language’s area estimate by lexical category.

# Dot-and-interval plot: each language's mean area per lexical category with
# its 95% bootstrap CI, faceted by category and instrument/measure.
ggplot(area.summary, aes(y = language, x = mean, col = lexical_category)) +
  facet_grid(lexical_category ~ form + measure) +
  geom_point() +
  geom_segment(aes(x = ci.low, xend = ci.high,
                   y = language, yend = language)) +
  # Fixed: geom_hline takes `yintercept`, not `y`; aes(y = 0) is not a
  # valid hline specification.
  geom_hline(yintercept = 0, color = "gray", linetype = "dashed") +
  # guide = FALSE is deprecated in ggplot2; "none" is the supported form.
  scale_colour_brewer(palette = "Set1", name = "", guide = "none") +
  scale_y_discrete(limits = rev(levels(area.summary$language))) +
  theme_bw() +
  theme(text = element_text(family = font)) +
  geom_vline(xintercept = 0, linetype = "dotted") +
  ylab("") +
  xlab("\nRelative representation in early vocabulary")